# Importing Dataset
# Load the two CSV exports into data frames; the first row of each file is
# treated as the column header. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
# NOTE(review): the absolute, user-specific paths only resolve on the original
# author's machine -- adjust before running elsewhere.
listing <- read.csv("C:/Users/lawye/Downloads/listings.csv", header = TRUE)
calendar <- read.csv("C:/Users/lawye/Downloads/calendar.csv", header = TRUE)
# Ordinary least-squares fit of listing price on room type, stay/review
# counts, host listing count and yearly availability, followed by the printed
# coefficient table and fit statistics.
model <- lm(
  price ~ room_type + minimum_nights + number_of_reviews +
    reviews_per_month + calculated_host_listings_count + availability_365,
  data = listing
)
summary(model)
##
## Call:
## lm(formula = price ~ room_type + minimum_nights + number_of_reviews +
## reviews_per_month + calculated_host_listings_count + availability_365,
## data = listing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -261.4 -84.7 -34.3 15.3 9933.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 203.12477 2.95291 68.788 < 2e-16 ***
## room_typePrivate room -134.64909 3.09622 -43.488 < 2e-16 ***
## room_typeShared room -200.48755 7.61709 -26.321 < 2e-16 ***
## minimum_nights -0.15554 0.06529 -2.382 0.0172 *
## number_of_reviews -0.22882 0.03134 -7.301 2.92e-13 ***
## reviews_per_month -9.17536 0.77029 -11.912 < 2e-16 ***
## calculated_host_listings_count 0.76611 0.14082 5.440 5.35e-08 ***
## availability_365 0.18005 0.01068 16.861 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 268.8 on 35229 degrees of freedom
## (9267 observations deleted due to missingness)
## Multiple R-squared: 0.0763, Adjusted R-squared: 0.07611
## F-statistic: 415.7 on 7 and 35229 DF, p-value: < 2.2e-16
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.3
# Put the columns of `listing` on the search path so they can be referenced
# by bare name.
attach(listing)
# Preview the first rows, then compare the price distribution across room types
head(listing)
# Overlaid, semi-transparent density curves, one per room type; the x axis is
# limited to prices between 0 and 500.
ggplot(data = listing, mapping = aes(x = price, fill = room_type)) +
  geom_density(alpha = 0.5) +
  xlim(c(0, 500)) +
  scale_fill_discrete(name = "Room Type") +
  labs(x = "Room Price", y = "Density") +
  ggtitle("Airbnb Room Price Distribution") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 2280 rows containing non-finite values (stat_density).

library(leaflet)
## Warning: package 'leaflet' was built under R version 3.4.4
# Geographical mapping of Airbnb dataset
# Interactive map: one clustered marker per listing, placed at its
# longitude/latitude, with the listing name as hover label and the price as
# click popup.
airbnb_map <- leaflet(listing) %>%
  addTiles(attribution = "Airbnb Dataset") %>%
  addMarkers(
    lng = ~longitude,
    lat = ~latitude,
    popup = ~as.character(price),
    label = ~as.character(name),
    clusterOptions = markerClusterOptions()
  )
# Render the map
airbnb_map
# Determine whether there is a significant difference in room price between rooms on Santa Catalina Island and rooms off the island
# Flag listings whose neighbourhood matches the Catalina pattern
# (case-insensitive). The column is read from `listing` explicitly instead of
# through the search path, so the flag is always computed from the current
# data frame.
indices <- grepl(
  "two harbors|avalon|catalina",
  listing$neighbourhood,
  ignore.case = TRUE
)
listing$Catalina <- ifelse(indices, "Yes", "No")
# Box plot of price for flagged vs. unflagged listings.
# ylim() discards prices outside 0-500 before the box statistics are computed
# (this is the source of the "Removed ... rows" warning).
ggplot(data = listing, mapping = aes(x = Catalina, y = price, fill = Catalina)) +
  geom_boxplot() +
  ylim(c(0, 500)) +
  xlab("Santa Catalina Island") +
  ggtitle("Room Price in Catalina Island") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = "IslandOrNot")
## Warning: Removed 2280 rows containing non-finite values (stat_boxplot).

# Mean price for each value of the Catalina flag (printed at top level)
listing %>%
  group_by(Catalina) %>%
  summarise(AvgPrice = mean(price))
# Overlaid price densities for the two Catalina groups; the x axis is limited
# to prices between 0 and 1000.
ggplot(data = listing, mapping = aes(x = price, fill = Catalina)) +
  geom_density(alpha = 0.5) +
  xlim(c(0, 1000)) +
  scale_fill_discrete(name = "Catalina Island") +
  labs(x = "Room Price", y = "Density") +
  ggtitle("Airbnb Room Price Distribution") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 890 rows containing non-finite values (stat_density).

# Determine whether Airbnb rooms near a beach (listings whose name mentions "beach") are generally more expensive than those farther from one
# Flag listings whose name contains "beach" (case-insensitive). The column is
# read from `listing` explicitly instead of through the search path, so the
# flag is always computed from the current data frame.
indices2 <- grepl("beach", listing$name, ignore.case = TRUE)
listing$Beach <- ifelse(indices2, "Yes", "No")
# Box plot of price for flagged vs. unflagged listings.
# ylim() discards prices outside 0-500 before the box statistics are computed
# (this is the source of the "Removed ... rows" warning).
ggplot(data = listing, mapping = aes(x = Beach, y = price, fill = Beach)) +
  geom_boxplot() +
  ylim(c(0, 500)) +
  xlab("Beach") +
  ggtitle("Room Price By Beach") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = "BeachOrNot")
## Warning: Removed 2280 rows containing non-finite values (stat_boxplot).

# Mean price for each value of the Beach flag (printed at top level)
listing %>%
  group_by(Beach) %>%
  summarise(AvgPrice = mean(price))
# Divide Airbnb room availability into six categories and determine how much
# they affect the room price
# Bin availability_365 into six labelled ranges and store the result as a
# factor column. The source column is read from `listing` explicitly rather
# than through the search path.
# NOTE(review): with cut()'s defaults the first interval is (0, 20], so a
# value of exactly 0 falls outside every bin and becomes NA -- that is the
# <NA> column in the table below. Pass include.lowest = TRUE if such listings
# should be counted under "0-20"; the chart code excludes the NA bin.
listing$Availability <- cut(
  listing$availability_365,
  breaks = c(0, 20, 72, 146, 220, 294, 365),
  labels = c("0-20", "21-72", "73-146", "147-220", "221-294", "295-365")
)
# Count listings per bin, reporting NA as its own category
table(listing$Availability, exclude = NULL)
##
## 0-20 21-72 73-146 147-220 221-294 295-365 <NA>
## 1757 4758 7026 5586 3171 14711 7495
# Floored mean price for every Availability x Beach combination.
# Rows without an availability bin are removed by an explicit NA test instead
# of slicing off the last two rows of the summary: the positional slice is
# only right when the NA bin produces exactly two rows that sort last, and
# would otherwise silently drop real bins. ungroup() returns a plain tibble so
# no grouping leaks into later steps.
new_list <- listing %>%
  filter(!is.na(Availability)) %>%
  group_by(Availability, Beach) %>%
  summarise(AvgPrice = floor(mean(price))) %>%
  ungroup()
## Warning: package 'bindrcpp' was built under R version 3.4.3
# Dodged bar chart of the floored mean price per availability bin, with one
# bar per Beach value; bar heights come straight from AvgPrice.
p <- ggplot(data = new_list, mapping = aes(x = Availability, y = AvgPrice)) +
  geom_col(
    mapping = aes(color = Beach, fill = Beach),
    position = position_dodge(0.8),
    width = 0.7
  ) +
  scale_color_manual(values = c("#0073C2FF", "#EFC000FF")) +
  scale_fill_manual(values = c("#0073C2FF", "#EFC000FF")) +
  ylim(c(0, 400)) +
  ggtitle("Airbnb Room Price By Availability and Beach") +
  theme(plot.title = element_text(hjust = 0.5))
# Print each bar's value just above it, using the same dodge width so the
# labels line up with their bars.
p +
  geom_text(
    mapping = aes(label = AvgPrice, group = Beach),
    position = position_dodge(0.8),
    vjust = -0.3,
    size = 3.5
  )

# Measure how the price of Airbnb rooms on Catalina Island is affected by availability
# Assign the Availability column (overwriting any existing values) using a
# different set of six bins. The source column is read from `listing`
# explicitly rather than through the search path.
# NOTE(review): with cut()'s defaults the first interval is (0, 72], so a
# value of exactly 0 falls outside every bin and becomes NA -- that is the
# <NA> column in the table below. Pass include.lowest = TRUE if such listings
# should be counted under "0-72"; the chart code excludes the NA bin.
listing$Availability <- cut(
  listing$availability_365,
  breaks = c(0, 72, 146, 220, 300, 330, 365),
  labels = c("0-72", "73-146", "147-220", "221-300", "301-330", "331-365")
)
# Count listings per bin, reporting NA as its own category
table(listing$Availability, exclude = NULL)
##
## 0-72 73-146 147-220 221-300 301-330 331-365 <NA>
## 6515 7026 5586 3516 3046 11320 7495
# Floored mean price for every Availability x Catalina combination.
# Rows without an availability bin are removed by an explicit NA test instead
# of slicing off the last two rows of the summary: the positional slice is
# only right when the NA bin produces exactly two rows that sort last, and
# would otherwise silently drop real bins. ungroup() returns a plain tibble so
# no grouping leaks into later steps.
new_list2 <- listing %>%
  filter(!is.na(Availability)) %>%
  group_by(Availability, Catalina) %>%
  summarise(AvgPrice = floor(mean(price))) %>%
  ungroup()
# Dodged bar chart of the floored mean price per availability bin, with one
# bar per Catalina value; bar heights come straight from AvgPrice.
p2 <- ggplot(data = new_list2, mapping = aes(x = Availability, y = AvgPrice)) +
  geom_col(
    mapping = aes(color = Catalina, fill = Catalina),
    position = position_dodge(0.8),
    width = 0.7
  ) +
  scale_color_manual(values = c("#0073C2FF", "#EFC000FF")) +
  scale_fill_manual(values = c("#0073C2FF", "#EFC000FF")) +
  ylim(c(0, 700)) +
  ggtitle("Airbnb Room Price By Availability and Catalina") +
  theme(plot.title = element_text(hjust = 0.5))
# Print each bar's value just above it, using the same dodge width so the
# labels line up with their bars.
p2 +
  geom_text(
    mapping = aes(label = AvgPrice, group = Catalina),
    position = position_dodge(0.8),
    vjust = -0.8,
    size = 3.5
  )
